Lab4 KessieSHEN

1.Read in the data

library(data.table)
if (!file.exists("met_all.gz"))
  download.file(
    url = "https://raw.githubusercontent.com/USCbiostats/data-science-data/master/02_met/met_all.gz",
    destfile = "met_all.gz",
    method   = "libcurl",
    timeout  = 60
    )
met <- data.table::fread("met_all.gz")

2.Prepare the Data

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between()     masks data.table::between()
✖ dplyr::filter()      masks stats::filter()
✖ dplyr::first()       masks data.table::first()
✖ lubridate::hour()    masks data.table::hour()
✖ lubridate::isoweek() masks data.table::isoweek()
✖ dplyr::lag()         masks stats::lag()
✖ dplyr::last()        masks data.table::last()
✖ lubridate::mday()    masks data.table::mday()
✖ lubridate::minute()  masks data.table::minute()
✖ lubridate::month()   masks data.table::month()
✖ lubridate::quarter() masks data.table::quarter()
✖ lubridate::second()  masks data.table::second()
✖ purrr::transpose()   masks data.table::transpose()
✖ lubridate::wday()    masks data.table::wday()
✖ lubridate::week()    masks data.table::week()
✖ lubridate::yday()    masks data.table::yday()
✖ lubridate::year()    masks data.table::year()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)
met <- met[temp>-17]
head(met)
   USAFID  WBAN  year month   day  hour   min   lat      lon  elev wind.dir
    <int> <int> <int> <int> <int> <int> <int> <num>    <num> <int>    <int>
1: 690150 93121  2019     8     1     0    56  34.3 -116.166   696      220
2: 690150 93121  2019     8     1     1    56  34.3 -116.166   696      230
3: 690150 93121  2019     8     1     2    56  34.3 -116.166   696      230
4: 690150 93121  2019     8     1     3    56  34.3 -116.166   696      210
5: 690150 93121  2019     8     1     4    56  34.3 -116.166   696      120
6: 690150 93121  2019     8     1     5    56  34.3 -116.166   696       NA
   wind.dir.qc wind.type.code wind.sp wind.sp.qc ceiling.ht ceiling.ht.qc
        <char>         <char>   <num>     <char>      <int>         <int>
1:           5              N     5.7          5      22000             5
2:           5              N     8.2          5      22000             5
3:           5              N     6.7          5      22000             5
4:           5              N     5.1          5      22000             5
5:           5              N     2.1          5      22000             5
6:           9              C     0.0          5      22000             5
   ceiling.ht.method sky.cond vis.dist vis.dist.qc vis.var vis.var.qc  temp
              <char>   <char>    <int>      <char>  <char>     <char> <num>
1:                 9        N    16093           5       N          5  37.2
2:                 9        N    16093           5       N          5  35.6
3:                 9        N    16093           5       N          5  34.4
4:                 9        N    16093           5       N          5  33.3
5:                 9        N    16093           5       N          5  32.8
6:                 9        N    16093           5       N          5  31.1
   temp.qc dew.point dew.point.qc atm.press atm.press.qc       rh
    <char>     <num>       <char>     <num>        <int>    <num>
1:       5      10.6            5    1009.9            5 19.88127
2:       5      10.6            5    1010.3            5 21.76098
3:       5       7.2            5    1010.6            5 18.48212
4:       5       5.0            5    1011.6            5 16.88862
5:       5       5.0            5    1012.7            5 17.38410
6:       5       5.6            5    1012.7            5 20.01540
library(dplyr)
key_variables <- c("temp", "rh", "wind.sp", "vis.dist", "dew.point", "lat", "lon", "elev")
for (var in key_variables) {
  met[get(var) %in% c(9999, 999), (var) := NA]
}
met[, date := as.Date(paste(year, month, day, sep = "-"))]
setDT(met)
met <- met[date >= as.Date("2019-08-01") & date <= as.Date("2019-08-07")]
met <- as.data.table(met)


# Compute mean values by station
means_by_station <- met[, .(
  temp_mean = mean(temp, na.rm = TRUE),
  rh_mean = mean(rh, na.rm = TRUE),
  wind_sp_mean = mean(wind.sp, na.rm = TRUE),
  vis_dist_mean = mean(vis.dist, na.rm = TRUE),
  dew_point_mean = mean(dew.point, na.rm = TRUE),
  lat_mean = mean(lat, na.rm = TRUE),
  lon_mean = mean(lon, na.rm = TRUE),
  elev_mean = mean(elev, na.rm = TRUE)
), by = .(USAFID)]

# Create a region variable for NW, SW, NE, SE
met[, region := fifelse(lat > 39.71 & lon < -98.00, "NE",
                        fifelse(lat > 39.71 & lon >= -98.00, "NW",
                        fifelse(lat <= 39.71 & lon < -98.00, "SE", "SW")))]

#met_avg[, elev.cat := fifelse(elev>252, "high", "low")]

3.Use geom_violin to examine the wind speed and dew point by region

library(ggplot2)

ggplot(met, aes(x = factor(1), y = wind.sp, fill = region)) +
  geom_violin(trim = TRUE) +
  facet_wrap(~ region) +  # Use facets to separate regions
  labs(title = "Wind Speed by Region",
       x = "Region",
       y = "Wind Speed") +
  theme_minimal() +
  theme(legend.position = "bottom")
Warning: Removed 8515 rows containing non-finite outside the scale range
(`stat_ydensity()`).

ggplot(met, aes(x = factor(1), y = dew.point, fill = region)) +
  geom_violin(trim = TRUE) +
  facet_wrap(~ region) +  # Use facets to separate regions
  labs(title = "Dew Point by Region",
       x = "Region",
       y = "Dew Point") +
  theme_minimal() +
  theme(legend.position = "bottom")
Warning: Removed 1227 rows containing non-finite outside the scale range
(`stat_ydensity()`).

region_colors <- c("NW" = "blue", "SW" = "green", "NE" = "red", "SE" = "purple")

4.Use geom_jitter with stat_smooth to examine the association between dew point and wind speed by region

#Deal with NA before coloring points by region
met_clean <- na.omit(met, cols = c("dew.point", "wind.sp", "region"))
ggplot(met_clean, aes(x = dew.point, y = wind.sp, color = region)) +
  geom_jitter(width = 0.3, height = 0.3) +  # Adds a small amount of noise to each point
  stat_smooth(method = "lm", se = FALSE) +  # Fit linear regression lines
  labs(title = "Association between Dew Point and Wind Speed by Region",
       x = "Dew Point (°C)", y = "Wind Speed (m/s)") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

Southeast Region: This region often has higher dew points Northwest Region :Dew points may not correlate as strongly with wind speed

**5.Use geom_bar to create barplots of the weather stations by elevation category colored by region

#bar plot of weather stations by elevation category
library(ggplot2)
ggplot(met, aes(x =factor(1), fill = region)) +
  geom_bar(position = "dodge") +
  labs(title = "Weather Stations by Elevation Category", x = "Elevation", y = "Count") +
  scale_fill_brewer(palette = "Set1") +  
  theme_minimal()

a higher count of stations in the “Low” elevation category compared to High.

**6.Use stat_summary to examine mean dew point and wind speed by region with standard deviation error bars

ggplot(met_clean, aes(x = region, y = dew.point)) +
  stat_summary(fun.data = "mean_sdl", geom = "pointrange") +
  labs(title = "Mean Dew Point by Region", x = "Region", y = "Dew Point (°C)") +
  theme_minimal()

# Adding error bars for wind speed
ggplot(met_clean, aes(x = region, y = wind.sp)) +
  stat_summary(fun.data = "mean_sdl", geom = "pointrange") +
  labs(title = "Mean Wind Speed by Region", x = "Region", y = "Wind Speed (m/s)") +
  theme_minimal()

the average wind speeds can vary greatly depending on the geographical location of a region. Coastal areas, typically experience higher average wind speeds, inland areas, like the Midwest, often have lower average wind speeds

**7.Make a map showing the spatial trend in relative humidity in the US

library(leaflet)
met_clean_rh <- na.omit(met, cols = "rh")
pal <- colorNumeric(palette = "Blues", domain = met_clean_rh$rh)
leaflet(data = met_clean_rh) %>%
  addTiles() %>%
  addCircleMarkers(~lon, ~lat, color = ~pal(rh), radius = 3, 
                   popup = ~paste("Station:", USAFID, "<br>RH:", rh),
                   group = "Humidity") %>%
  addLegend(pal = pal, values = ~rh, title = "Relative Humidity (%)") %>%
  setView(lng = -98, lat = 39.5, zoom = 4)

high RH values (darker blue) in the Southeastern US and along the Coast,much lower RH values (lighter blue) in the desert regions of the Southwest.westward and go from the Midwest to the Great Plains, RH generally decrease.

**8. Use a ggplot extension

library(ggplot2)
library(ggridges)
library(data.table)
met_clean <- met[!is.na(dew.point) & !is.na(region), ]
ggplot(met_clean, aes(x = dew.point, y = region, fill = region)) +
  geom_density_ridges(scale = 1.5, alpha = 0.7) +
  labs(
    title = "Dew Point Distribution by Region",
    x = "Dew Point (°C)",
    y = "Region"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
Picking joint bandwidth of 0.389